% Data-Driven Identification of Prognostic Tumor Subpopulations Using 
% Spatially Mapped t-SNE of Mass Spectrometry Imaging Data
% ===============NOTES:=====================
% Note: Some parts of this pipeline use an R implementation. Please install R and a
% suitable toolbox to connect R and Matlab.
% Note: We use the R package samr to perform the SAM analysis, and it is integrated
% in this Matlab script (calling R from Matlab). The results may therefore not be
% printed in a readable manner; in that case you can find the explicit
% R implementation in the attached file "SAM_GastricCancer_R.txt"
% (if needed, change the value of the parameter delta; here, delta corresponds to FDR < 0.001)
% ******************** Terms and Conditions ***********************
% Please note that by using this implementation you agree on the terms of
% use:
%   1. The software may be used for research purposes only.
%   2. The software is for personal use only, and may not be redistributed.
%   3. In no event shall the LUMC be liable for any direct or indirect damage
%      arising in any way out of the use of this software.
%   4. Any publication arising from the use of this implementation should
%      cite the reference: W. Abdelmoula et al., "Data-Driven Identification of Prognostic Tumor Subpopulations Using
%                   Spatially Mapped t-SNE of Mass Spectrometry Imaging Data", PNAS, 2016.
%       
% =========================================================================
% Written By Walid M. Abdelmoula, LUMC, NL


%% Load dataset and include required functions
% Start from an empty workspace and a clean command window, then read the
% variables stored in the dataset file into the workspace.
clear all;
clc;
load('breast_cancer_dataset.mat');

% Register the helper folders in a single call. addpath places its arguments at
% the top of the search path in the order given, so the folders are listed in
% reverse here to end up with the same precedence as adding them one by one
% (Functions, kaplanmeier_plot, FisherExactTest, RviaMatlab).
addpath('../Matlab_Files/RviaMatlab/', ...
        '../Matlab_Files/Functions/FisherExactTest/', ...
        '../Matlab_Files/Functions/kaplanmeier_plot/', ...
        '../Matlab_Files/Functions/');
%% 1. Dimensionality Reduction:
% Unfold the MSI data cube into a (pixels x masses) matrix, keep the pixels
% flagged in goodlist, embed them in 3-D with tSNE and visualize the embedding
% both as a scatter plot and as a spatial image.
S = size(MSI_data_cube);
N_Masses   = S(3);
N_Patients = length(Clinical_data);

% One row per pixel, one column per mass channel.
Folded_Measurements = reshape(MSI_data_cube, S(1)*S(2), S(3));

% Linear indices of the pixels with goodlist > 0; the same indices select the
% rows to embed and are later used to map results back to image space.
indx = find(goodlist > 0);
Tumors_All = Folded_Measurements(indx, :);

% First tSNE run, followed by a second run that is initialized with the map
% of the first one (for reproducibility and global convergence).
mappedX1_3D = fast_tsne_seed(Tumors_All, 3);
mappedX2_3D = fast_tsne_seed(Tumors_All, 3, [], [], [], [], mappedX1_3D);

% Per-pixel colors derived from the 3-D embedding (L*a*b* color system).
lab_Coloring = embedding2LABcolormap(mappedX2_3D);

% Scatter visualization of the embedding, one point (marker size 3) per pixel.
figure;
scatter3(mappedX2_3D(:,1), mappedX2_3D(:,2), mappedX2_3D(:,3), 3, lab_Coloring);

% tSNE spatial image: the same colors placed back at their pixel positions.
tSNE_LABSegmentationMap = Visualize_tSNE_2DImage(lab_Coloring, S(1), S(2), indx);
%% 2. Find the optimal number of clusters from the tSNE Space
% Find the optimal number of clusters from the tSNE-spatial image using Bisecting Kmeans.
% Note: In this run, we set k-iterations 2:10, however, you can change it in this function (Optimal_NumberClusters)
[K_Clusters, Corr_Values, IDX, C] = Optimal_NumberClusters(mappedX2_3D,S,indx);
% Rank the correlation values from best to worst. This is done once here and
% reused by the selection step below.
[Ranked_Correlation, Ranked_Index] = sort(Corr_Values,'descend');
%% Select the Ranked peak( 1st, 2nd,...etc)
% Rank_Order picks which peak of the ranking to use (1 = highest correlation).
Rank_Order = 1;
% NOTE(review): Ranked_Index holds positions within Corr_Values, and that
% position is used directly as the number of clusters. This presumes that
% Corr_Values is indexed by k -- confirm against Optimal_NumberClusters.
% (No semicolon: the selected value is echoed to the command window.)
K_ranked = Ranked_Index(Rank_Order)
% Options vector handed to MyKmeans; the meaning of its entries is defined there.
opt = [1E-6 1 1];
[rIDX,rC,rCovMat,rDmat,rCo] = MyKmeans(mappedX2_3D,K_ranked,opt);
[Kmeans_SegmentationMap, IDXs, Cs, Color_Scheme] = Visualize_combined_ClusteredImage(rIDX,rC,S,indx,K_ranked);

% assign_regions applies a threshold to check whether a subpopulation from
% a patient will be considered for further analysis to be associated with
% clinical analysis or it will be neglected (threshold = (1/k)*100%)
% The sample IDs are restricted with the same mask (goodlist > 0) that was
% used to select the embedded pixels, so they stay aligned with IDXs even if
% goodlist is not strictly 0/1.
[sample_to_component, pixel_to_component, IDXs_Thresholded] = assign_regions(IDXs, K_ranked, pixel_to_sample_ID(goodlist > 0));
% Number of distinct labels in pixel_to_component minus one
% (presumably one label marks unassigned pixels -- confirm in assign_regions).
nr_comps = length(unique(pixel_to_component))-1;
Compact_Subpopulations = Subpopulations_Patients(sample_to_component); % This is another organization of sample_to_component
% Color the tSNE scatter map based on the identified clusters (i.e. subpopulations)
close all;
[ColorMap, colstr] = MyColorMap(nr_comps);
plots_dir = pwd;
% Fetch the jet colormap; the following close removes the current figure, if any.
colmap = jet; close;
% Rows of jet selected by ColorMap, scaled to 0-255 (echoed to the command window).
RGB_COLORS_OfClusters = colmap(ColorMap,:)*255
figure, scatter3(mappedX2_3D(:,1),mappedX2_3D(:,2),mappedX2_3D(:,3),3,IDXs);
% colormap expects values in [0,1], hence the division by 255.
colormap(RGB_COLORS_OfClusters./255); grid off
 %% 3. Link to Clinical Data:
% Assign the contribution of the two classes to each cluster: the call below
% returns one count vector per class (labelled pN=0 and pN=1 in the legend).
% No semicolon: both vectors are echoed to the command window.
clear Kstring;
[N_Subpop_pN0, N_Subpop_pN1] = assign_survivalClassesToClusters(Clinical_data, goodlist, pixel_to_sample_ID,sample_to_component)

% One label ('K1', 'K2', ...) per cluster of the clustering that was actually
% run, i.e. K_ranked clusters, consistent with the histogram rows below.
Kstring = cell(1, K_ranked);
for i = 1:K_ranked
    Kstring(i) = {strcat('K',num2str(i))};
end

% Group Histogram: one row per cluster, columns = [pN=0 count, pN=1 count].
GG = [reshape(N_Subpop_pN0(1:K_ranked), [], 1), reshape(N_Subpop_pN1(1:K_ranked), [], 1)];

% Close the current figure, fetch the jet colormap, and close again
% (removes a figure that jet may have opened).
close; colmap = jet; close;
colors = colmap(Color_Scheme,:).*255;

% Two series colors: light grey for pN=0, black for pN=1.
RGB_COLORS_OfClusters = [0.83 0.82 0.78; 0 0 0];
figure;
hBar = bar(GG,'stacked');
% Set the face color of every series explicitly: recent MATLAB releases color
% bar series from the axes ColorOrder rather than from the figure colormap.
for iSeries = 1:min(numel(hBar), size(RGB_COLORS_OfClusters,1))
    set(hBar(iSeries), 'FaceColor', RGB_COLORS_OfClusters(iSeries,:));
end
% Kept for older releases in which bar series take their colors from the colormap.
colormap(RGB_COLORS_OfClusters);
xlabel('Cluster subpopulations');
ylabel('Counts');
legend({'pN=0','pN=1'},'FontSize',14,'FontWeight','bold');
%% 4. Investigate Statistical Significance: Fisher Exact test:
% Run the test on the two per-cluster count vectors. The statement has no
% trailing semicolon, so all three outputs are echoed to the command window.
[Sig, PValue, ContigenMatrix] = FisherExactTest(N_Subpop_pN0, N_Subpop_pN1)

%% 5. SAM: Cluster_ID: represents the tumor subpopulations we are interested to retrieve its prognostic signature
% Drop any stale result from a previous run before preparing the inputs.
clear('msdata_average_CombinedSubpop');

% Settings.
MultiClass_MultiLabeling = 0;
Only_Metas_Sub = 7; % ID of the metastatic-exclusive cluster; look at (N_Subpop_pN0, N_Subpop_pN1)

% Inputs derived from the clustering results above.
SelectedSubpop = unique(IDXs);
IDXs_Values = IDXs_Thresholded;
indxoo = indx;
Compact_Subpopulations = Subpopulations_Patients(sample_to_component);

% SAM_Breast is invoked without arguments; presumably it is a script that
% reads the variables prepared above from the workspace -- confirm.
SAM_Breast
